從 WebService 收到的 XML 資料,.NET 解析時觸發錯誤「XmlException: ‘’ (十六進位值 0x08) 是無效的字元。」
直覺地執行將 0x08 字串取代為空白字元的指令,但執行後仍然是同樣的錯誤訊息…意外地取代失敗了!!!
再打開資料檔搜尋了 0x08 也沒找到任何字元…查了 ASCII 表,發現它是不可視的控制字元(\b,Backspace 字元)!好,再用 Regexp 寫了「\b」取代指令,執行後還是同樣的錯誤訊息!!!
太詭異了…時值萬聖節,難道是這時候出了靈異現象?想太多沒用…手動把資料抓下來後開文字檔慢慢排查,試了 ASCII 碼「\b」字元 2、8、16 進位各種寫法,還是找不到;問 ChatGPT 和 Google 有各種清除字元的手法,但就是濾不到這詭異的 0x08。
看資料檔是以 UTF 編碼儲存,於是改用「UTF 0x08」搜尋…前幾頁仍然沒找到什麼結果,再改用「Unicode 0x08」搜尋後就在第一頁看到了 fileformat.info 這個網站列出 Backspace 字元全部的字元格式,赫然發現它竟然有 HTML 編碼「&#8;」和「&#x8;」!
是了,就是它!XML 也會使用 HTML 編碼!試了幾種組合,抓到鬼了,就是「&#x8;」在作怪!花了五、六個小時就卡在這個「&#x8;」—以 HTML 編碼的 Backspace 字元…這就是與老系統打交道的宿命~~~總是卡在奇妙的地方好幾個小時甚至幾天。
最後以最近在嘗試的 Functional Programming 寫法,寫個取代 HTML 編碼控制字元的副程式 :
/// <summary>
/// Removes XML numeric character references that encode control characters,
/// for example <c>&amp;#x8;</c> or <c>&amp;#8;</c> (Backspace), from raw XML text.
/// </summary>
/// <remarks>
/// Both the hexadecimal (<c>&amp;#xH;</c>) and decimal (<c>&amp;#N;</c>) forms are handled,
/// with any number of leading zeros and either case for the hex digits.
/// References to code points 0x00-0x1F and 0x7F are removed, except for TAB (0x09),
/// LF (0x0A) and CR (0x0D): those three are legal XML characters, so dropping them
/// would silently alter valid data. Every other reference is left untouched.
/// </remarks>
/// <param name="text">Raw XML text to clean. <c>null</c> or empty input is returned unchanged.</param>
/// <returns>The input with the offending character references removed.</returns>
private string CleanInvalidUnicodeChars(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return text;
    }

    // XML only allows a lowercase "x" as the hex marker, so "&#X8;" is deliberately not matched.
    return System.Text.RegularExpressions.Regex.Replace(
        text,
        "&#(?:x(?<hex>[0-9A-Fa-f]+)|(?<dec>[0-9]+));",
        match =>
        {
            bool isHex = match.Groups["hex"].Success;
            string digits = (isHex ? match.Groups["hex"].Value : match.Groups["dec"].Value).TrimStart('0');

            // Every code point handled here is below 0x80, so more than three significant
            // digits means "not ours"; bailing out early also avoids overflow while parsing.
            if (digits.Length > 3)
            {
                return match.Value;
            }

            int codePoint = digits.Length == 0 ? 0 : System.Convert.ToInt32(digits, isHex ? 16 : 10);
            bool isLegalWhitespace = codePoint == 0x09 || codePoint == 0x0A || codePoint == 0x0D;
            bool isRemovable = (codePoint < 0x20 && !isLegalWhitespace) || codePoint == 0x7F;

            return isRemovable ? string.Empty : match.Value;
        });
}
以下是這次詭異狀況的程式碼:
using System;
using System.IO;
using System.Xml;
using System.Xml.Serialization;
namespace ConsoleApp1
{
    /// <summary>
    /// Console sample that deserializes a SOAP "GetCity" response with <see cref="XmlSerializer"/>
    /// and prints the city it contains.
    /// </summary>
    internal class Program
    {
        static void Main(string[] args)
        {
            // NOTE(review): the accompanying article says this sample triggers the XmlException;
            // the control-character reference (presumably "&#x8;" inside City_Name) is not present
            // in this literal - confirm against the original post before using it as a repro.
            string xml = @"<?xml version=""1.0"" encoding=""utf-8""?>
<soap:Envelope xmlns:xsi=""http://www.w3.org/2001/XMLSchema-instance"" xmlns:xsd=""http://www.w3.org/2001/XMLSchema"" xmlns:soap=""http://schemas.xmlsoap.org/soap/envelope/"">
<soap:Body>
<GetCityResponse xmlns=""http://tempuri.org/"">
<GetCityResult>
<City>
<City_ID>07</City_ID>
<City_Name>彰化縣</City_Name>
</City>
</GetCityResult>
</GetCityResponse>
</soap:Body>
</soap:Envelope>";
            XmlSerializer serializer = new XmlSerializer(typeof(Envelope), new XmlRootAttribute
            {
                ElementName = "Envelope",
                Namespace = "http://schemas.xmlsoap.org/soap/envelope/"
            });
            using (StringReader reader = new StringReader(xml))
            {
                var envelope = (Envelope)serializer.Deserialize(reader);
                var city = envelope.Body.Response.Result.City;
                Console.WriteLine("County ID: {0}", city.City_ID);
                Console.WriteLine("County Name: {0}", city.City_Name);
            }
        }
    }

    /// <summary>Root <c>soap:Envelope</c> element of the response.</summary>
    [XmlRoot(ElementName = "Envelope", Namespace = "http://schemas.xmlsoap.org/soap/envelope/")]
    public class Envelope
    {
        /// <summary>The <c>soap:Body</c> child element.</summary>
        [XmlElement(ElementName = "Body", Namespace = "http://schemas.xmlsoap.org/soap/envelope/")]
        public Body Body { get; set; }
    }

    /// <summary>Content of <c>soap:Body</c>.</summary>
    public class Body
    {
        // The element name must match the payload ("GetCityResponse"); an unmatched name
        // leaves this property null after deserialization.
        /// <summary>The <c>GetCityResponse</c> payload element.</summary>
        [XmlElement(ElementName = "GetCityResponse", Namespace = "http://tempuri.org/")]
        public GetCityResponse Response { get; set; }
    }

    /// <summary>The <c>GetCityResponse</c> element in the <c>http://tempuri.org/</c> namespace.</summary>
    [XmlRoot(ElementName = "GetCityResponse", Namespace = "http://tempuri.org/")]
    public class GetCityResponse
    {
        /// <summary>The <c>GetCityResult</c> child element.</summary>
        [XmlElement(ElementName = "GetCityResult", Namespace = "http://tempuri.org/")]
        public GetCityResult Result { get; set; }
    }

    /// <summary>The <c>GetCityResult</c> element wrapping the returned city.</summary>
    [XmlRoot(ElementName = "GetCityResult")]
    public class GetCityResult
    {
        /// <summary>The <c>City</c> child element.</summary>
        [XmlElement(ElementName = "City", Namespace = "http://tempuri.org/")]
        public City City { get; set; }
    }

    /// <summary>A single city record.</summary>
    [XmlRoot(ElementName = "City")]
    public class City
    {
        /// <summary>Text of the <c>City_ID</c> element.</summary>
        public string City_ID { get; set; }

        /// <summary>Text of the <c>City_Name</c> element.</summary>
        public string City_Name { get; set; }
    }
}
但是換成用「XmlDocument.LoadXml()」寫法就不會報錯了,直接呈現「Backspace」往前刪除一個字元的效果:
using System;
using System.Xml;
namespace ConsoleApp1
{
    /// <summary>
    /// Console sample that loads the SOAP "GetCity" response into an <see cref="XmlDocument"/>
    /// and prints the City_ID / City_Name of the first City node found.
    /// </summary>
    internal class Program
    {
        static void Main(string[] args)
        {
            string xml = @"<?xml version=""1.0"" encoding=""utf-8""?>
<soap:Envelope xmlns:xsi=""http://www.w3.org/2001/XMLSchema-instance"" xmlns:xsd=""http://www.w3.org/2001/XMLSchema"" xmlns:soap=""http://schemas.xmlsoap.org/soap/envelope/"">
<soap:Body>
<GetCityResponse xmlns=""http://tempuri.org/"">
<GetCityResult>
<City>
<City_ID>07</City_ID>
<City_Name>彰化縣</City_Name>
</City>
</GetCityResult>
</GetCityResponse>
</soap:Body>
</soap:Envelope>";

            var document = new XmlDocument();
            document.LoadXml(xml);

            // Prefixes used by the XPath query below are bound to the payload's namespaces.
            var namespaces = new XmlNamespaceManager(document.NameTable);
            namespaces.AddNamespace("soap", "http://schemas.xmlsoap.org/soap/envelope/");
            namespaces.AddNamespace("tempuri", "http://tempuri.org/");

            XmlNode city = document.SelectSingleNode("//tempuri:City", namespaces);
            if (city == null)
            {
                // Nothing to print when the response carries no City element.
                return;
            }

            // Read both values before writing anything, then print them.
            string id = city["City_ID"].InnerText;
            string name = city["City_Name"].InnerText;
            Console.WriteLine("City ID: {0}", id);
            Console.WriteLine("City Name: {0}", name);
        }
    }
}
執行結果:
City ID: 07
City Name:彰化縣
本文同步發表至我的Blog